Setting the options for knitr…
library(knitr)
knitr::opts_chunk$set(results = 'asis',
echo = TRUE,
comment = NA,
prompt = FALSE,
warning = FALSE,
message = FALSE,
fig.align = "center",
fig.width = 8.125,
out.width = "100%",
fig.path = "Figures/figures_analysis_perfs/",
dev = c('png', 'tiff'),
dev.args = list(png = list(type = "cairo"), tiff = list(compression = 'lzw')),
cache = TRUE,
cache.path = "D:/cache/cache_analysis_perf/",
autodep = TRUE)
options(width = 1000, scipen = 999, knitr.kable.NA = '')
Loading libraries…
library(tidyverse) # The tidy verse
library(reshape2) # Data reshaping (melt / dcast)
library(caret) # To ease classification training
library(gridExtra) # To arrange several plots on a page
library(mlr) # Machine learning framework
library(rlist) # Helpers for list manipulation
library(kableExtra) # Enhanced HTML tables
library(summarytools) # Data frame summaries
library(glmnet) # Regularized generalized linear models
Cleaning the memory…
rm(list = ls(all.names = TRUE))
We define a ‘not in’ operator…
'%!in%' <- Negate("%in%")
We load the data tables and the definitions of the feature sets prepared during the curation phase…
load(file = "Study_objects.RData")
We define two vectors to rank individuals and call types by decreasing order of number of calls…
decreasing_order_nb_calls_by_individual <- df %>%
group_by(individual) %>%
summarize(n = n()) %>%
arrange(-n) %>%
pull(individual) %>%
as.character()
decreasing_order_nb_calls_by_type <- df %>%
group_by(type) %>%
summarize(n = n()) %>%
arrange(-n) %>%
pull(type) %>%
as.character()
We also define tables which contain the number of calls per individual, and per call type…
df_individual <- df %>%
pull(individual) %>%
table() %>%
as_tibble() %>%
arrange(-n) %>%
rename(Individual = names(.)[1])
df_type <- df %>%
pull(type) %>%
table() %>%
as_tibble() %>%
arrange(-n) %>%
rename(Type = names(.)[1])
Since we have to analyze the performances of various classifiers i) for both individuals (i.e. individual signatures) and (call) types, and ii) with different feature sets, it makes sense to define a number of generic functions which we will call repeatedly later on.
We first define a function define_classifier_type_and_configuration(), which takes a data table and adds two columns to it: one defining the type of classifier (single classifier or ensemble classifier), and one defining the configuration, i.e. the combination of a classifier and a feature set…
define_classifier_type_and_configuration <- function(df) {
df <- df %>%
mutate(classifier_type = ifelse(algo == "ensemble", "ensemble", "single classifier"),
classifier_type = as.factor(classifier_type),
configuration = paste0(algo, " - ", set),
configuration = as.factor(configuration))
return (df)
}
We then define the function load_and_prepare_data(), which takes as parameters i) the name of a file containing classification results for a target variable and a management strategy for sequences of calls, and ii) a vector of character strings defining an order for the levels of the target variable. An optional third parameter, all_odd_even, allows keeping only the odd- or even-indexed repetitions. The function loads and further prepares the data…
load_and_prepare_data <- function(filename, level_ordering, all_odd_even = "all") {
# Loading the data
load(filename)
if (all_odd_even == "odd")
all_outputs <- all_outputs[seq(1, length(all_outputs), 2)]
if (all_odd_even == "even")
all_outputs <- all_outputs[seq(2, length(all_outputs), 2)]
# Extracting various data from the list of results
performances <- all_outputs %>% purrr::map_dfr("performances")
confusion <- all_outputs %>% purrr::map_dfr("confusion")
fimp <- all_outputs %>% purrr::map_dfr("fimp")
sets <- all_outputs %>% purrr::map("sets")
sequence_scores <- sets %>% purrr::map_dbl("sequence_score")
training_sets <- sets %>% purrr::map("training_ids")
test_sets <- sets %>% purrr::map("testing_ids")
# Renaming the metrics/measures for easier identification
measure_names <- c("acc" = "accuracy", "auc" = "AUC", "bac" = "balanced accuracy", "kappa" = "Kappa", "logloss" = "log loss", "mmce" = "mean misclassification error")
performances <- performances %>%
mutate(measure_name = as.factor(measure_name),
measure_name = recode(measure_name, !!!measure_names))
# Adding the factors classifier_type and configuration
performances <- performances %>% define_classifier_type_and_configuration()
confusion <- confusion %>% define_classifier_type_and_configuration()
if (nrow(fimp) > 0) # Only if feature importance was computed
fimp <- fimp %>% define_classifier_type_and_configuration()
# Computing average values for performances, confusion data and feature importance (for all the iterations)
averaged_performances <- performances %>%
group_by(algo, set, measure_name, configuration, classifier_type) %>%
summarize(mean = mean(measure), sd = sd(measure), .groups = "drop")
averaged_confusion <- confusion %>%
group_by(algo, set, prediction, reference) %>%
summarize(n = mean(n), .groups = "drop")
averaged_fimp <- NULL
if (nrow(fimp) > 0)
averaged_fimp <- fimp %>%
group_by(algo, set, feature) %>%
summarize(mean = mean(imp), sd = sd(imp), .groups = "drop")
# We relevel the factors prediction and reference in confusion and averaged_confusion
confusion <- confusion %>% mutate(prediction = factor(prediction, levels = level_ordering),
reference = factor(reference, levels = level_ordering))
averaged_confusion <- averaged_confusion %>% mutate(prediction = factor(prediction, levels = level_ordering),
reference = factor(reference, levels = level_ordering))
# We return a list of objects
return (list(training_sets = training_sets,
test_sets = test_sets,
performances = performances,
averaged_performances = averaged_performances,
confusion = confusion,
averaged_confusion = averaged_confusion,
fimp = fimp,
averaged_fimp = averaged_fimp,
sequence_scores = sequence_scores))
}
The function get_histograms_9_configurations() creates a faceted figure containing 9 histograms - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC) - summarizing the classification performances as defined by a target metric/measure…
get_histograms_9_configurations <- function(performances_table, target_measure) {
p <- performances_table %>%
filter(algo != "ensemble", measure_name == target_measure) %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 20) +
geom_density() +
facet_grid(algo ~ set) +
xlab(target_measure) +
labs(title = paste0("Histograms of ", target_measure, " values"), subtitle = "Different classifiers and feature sets")
return (p)
}
The function get_barplots_comparison_9_configurations() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for 9 configurations - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC)…
get_barplots_comparison_9_configurations <- function(performances_table, target_measure) {
title <- paste0("Performances with the ", target_measure, " metric")
p <- performances_table %>%
filter(algo != "ensemble", measure_name == target_measure) %>%
ggplot(aes(x = set, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
labs(title = title, subtitle = "Different classifiers and feature sets", fill = "Classifier") +
theme_minimal() + coord_flip() + ylab(target_measure) + xlab("Feature set")
return (p)
}
The function get_barplots_comparison_all_configurations_by_classifier_type() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for all configurations present in the provided table, with a 2-color scheme indicating whether a classifier is an ensemble one or not…
get_barplots_comparison_all_configurations_by_classifier_type <- function(performances_table, target_measure) {
title <- paste0("Performances with the ", target_measure, " metric")
p <- performances_table %>%
filter(measure_name == target_measure) %>%
ggplot(aes(x = reorder(configuration, mean), y = mean, fill = classifier_type)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
labs(title = title, subtitle = "Different configurations", fill = "Classifier type") +
theme_minimal() + coord_flip() + ylab(target_measure) + xlab("Configuration")
return (p)
}
The function get_barplots_comparison_all_configurations_by_set() creates a barplot which displays the average classification performances - as assessed by a target measure/metric - for all configurations present in the provided table, with a color scheme representing the different feature sets…
get_barplots_comparison_all_configurations_by_set <- function(performances_table, target_measure) {
title <- paste0("Performances with the ", target_measure, " metric")
p <- performances_table %>%
filter(measure_name == target_measure) %>%
ggplot(aes(x = reorder(configuration, mean), y = mean, fill = set)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", position = position_dodge(width = 0.9)) +
labs(title = title, subtitle = "Different configurations", fill = "Feature set") +
theme_minimal() + coord_flip() + ylab(target_measure) + xlab("Configuration")
return (p)
}
The function get_confusion_matrix() creates a well-formatted figure to display the confusion matrix for a specific classifier (algo) and a specific feature set. One can choose whether to display raw counts in addition to row-wise percentages in the cells of the matrix…
get_confusion_matrix <- function(confusion_table, target_algo, target_set, display_raw_numbers = T, title = "", font_size = 3.5, axis_font_size = 10, axis_title_font_size = 10, title_font_size = 12) {
if (title == "")
title <- paste0("Confusion matrix - ", target_algo, " and ", target_set)
confusion_table <- confusion_table %>%
filter(algo == target_algo, set == target_set) %>%
select(-algo, -set)
cf_t <- confusion_table %>%
group_by(reference) %>% summarize(n_ref = sum(n)) %>%
ungroup()
confusion_table <- confusion_table %>%
inner_join(cf_t, by = "reference") %>%
mutate(percent = 100 * n / n_ref)
if (display_raw_numbers)
confusion_table <- confusion_table %>% mutate(legend = paste0(round(n, 1), " (", round(percent, 1), " %)"))
else
confusion_table <- confusion_table %>% mutate(legend = paste0(round(percent, 1), " %"))
p <- confusion_table %>%
ggplot(aes(x = prediction, y = reference, fill = percent)) +
geom_tile(show.legend = FALSE) +
geom_text(aes(label = legend), size = font_size) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = axis_font_size),
axis.text.y = element_text(size = axis_font_size),
axis.title = element_text(size = axis_title_font_size),
title = element_text(size = title_font_size, face='bold')) +
scale_fill_distiller(palette = "YlOrBr", direction = 1) +
ggtitle(title)
return (p)
}
The function get_confusion_matrix_9_configurations() creates a faceted figure to display 9 confusion matrices corresponding to 9 different configurations - 3 classifiers (SVM, NN, xgboost) x 3 feature sets (Bioacoustic, DCT, MFCC)…
get_confusion_matrix_9_configurations <- function(confusion_table) {
p <- confusion_table %>%
filter(algo != "ensemble") %>%
ggplot(aes(x = prediction, y = reference, fill = n)) +
geom_tile(show.legend = FALSE) +
#geom_text(aes(label = round(n, 1))) +
theme(axis.text.x=element_text(angle = 45, hjust = 1)) +
facet_grid(set ~ algo) +
scale_fill_distiller(palette = "YlOrBr", direction = 1) +
ggtitle("Confusion matrices for the different configurations")
return (p)
}
The function get_fimp_barplot() creates a barplot to display the importance of the different features of a given feature set, for a given classifier…
get_fimp_barplot <- function(fimp_table, target_algo, target_set, axis_font_size = 10, axis_title_font_size = 10, title_font_size = 12) {
title <- paste0(target_set, " - ", target_algo)
p <- fimp_table %>%
filter(algo == target_algo, set == target_set) %>% droplevels() %>%
ggplot(aes(x = reorder(feature, mean), y = mean)) +
geom_bar(stat = "identity", col = "black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
# geom_text(aes(label = round(mean, 3)), hjust = 1.8, color = "black", size = value_text_size) +
xlab("Feature") + ylab("Importance") +
theme_minimal() + coord_flip() +
theme(axis.text = element_text(size = axis_font_size),
axis.title = element_text(size = axis_title_font_size),
title = element_text(size = title_font_size, face='bold')) +
ggtitle(title)
return (p)
}
We first load the results of the assessments of the different models, and create different data tables…
outputs_individual <- load_and_prepare_data("Result files/results - individual - default.RData", decreasing_order_nb_calls_by_individual)
We have conducted 100 repetitions of the same sequence: i) randomly create a training-test partition, ii) train a model on the training set, iii) assess the model on the test set. We can look at the distribution of the values of the different metrics across the repetitions.
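The repetitions themselves were run beforehand and are simply loaded above; as a purely hypothetical sketch of one repetition (fit_model() and assess_model() are placeholders, and the 80/20 split ratio is an assumption):
one_repetition <- function(df) {
  # Stratified random partition of the calls into training and test sets
  idx <- caret::createDataPartition(df$individual, p = 0.8, list = FALSE)
  model <- fit_model(df[idx, ]) # placeholder for the actual training pipeline
  assess_model(model, df[-idx, ]) # placeholder returning the performance measures
}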
For logloss:
outputs_individual$performances %>% get_histograms_9_configurations("log loss")
While the histograms and density curves sometimes look irregular, one can imagine that more repetitions would gradually lead to a smooth and symmetrical distribution, possibly a Gaussian one. The range of variation of the log loss values is quite significant in each configuration, which confirms that running many repetitions was worthwhile.
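As a reminder, the multi-class log loss is the average negative log-probability assigned to the true class; a minimal sketch of the computation (not the mlr implementation used for the actual measures):
multiclass_logloss <- function(probs, truth, eps = 1e-15) {
  # probs: matrix of predicted probabilities (rows = calls, columns = classes)
  # truth: factor of true classes, with levels in the same order as the columns of probs
  probs <- pmin(pmax(probs, eps), 1 - eps) # clip the probabilities to avoid log(0)
  -mean(log(probs[cbind(seq_along(truth), as.integer(truth))]))
}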
For multi-class AUC, balanced accuracy and accuracy:
outputs_individual$performances %>% get_histograms_9_configurations("AUC")outputs_individual$performances %>% get_histograms_9_configurations("balanced accuracy")outputs_individual$performances %>% get_histograms_9_configurations("accuracy")We reach the same conclusions as previously with log loss.
We first focus on log loss, the metric used for tuning the hyper-parameters of our different classifiers. We display the average performances for each classifier and each set of predictors:
outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("log loss")
We can make a number of observations:
We can confirm these results with two other metrics: multi-class AUC and balanced accuracy…
p1 <- outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("AUC")
p2 <- outputs_individual$averaged_performances %>% get_barplots_comparison_9_configurations("balanced accuracy")
grid.arrange(p1, p2, ncol = 2)
While AUC gives nearly the same ranking across the 9 configurations as log loss, there are some differences with balanced accuracy. We can, however, still conclude that whatever the measure, xgboost and MFCC offer the best performances, especially when considered together. DCT parameters, on the contrary, lead to the worst performances.
We do not observe any specific pattern for the standard deviations.
We can display as a reminder the individuals ranked by the number of calls for each of them:
df_individual %>% mutate(n = text_spec(n, color = "white", align = "r", background = spec_color(n, option = "D", scale_from = c(0, 400)))) %>%
kbl(digits = 3, escape = F) %>%
kable_classic() %>%
kable_styling(full_width = F)
| Individual | n |
|---|---|
| Jill | 362 |
| Zuani | 257 |
| Zamba | 193 |
| Vifijo | 160 |
| Djanoa | 145 |
| Hortense | 111 |
| Kumbuka | 109 |
| Bolombo | 76 |
| Lina | 76 |
| Busira | 71 |
We can display the confusion matrix for our best case scenario, when xgboost is considered with MFCC:
outputs_individual$averaged_confusion %>% get_confusion_matrix("xgboost", "MFCC", display_raw_numbers = F)
We observe that the more calls an individual has, the better it tends to be recognized (although the trend is not linear), despite accounting for class imbalance with weights. ‘Heavier’ individuals (in terms of number of calls) also tend to attract classification outputs, although this is clearly the case only for Jill.
If one pays attention to the individuals with the smallest number of calls, Bolombo is the most poorly classified, while the situation is a bit better for Busira, Kumbuka and especially Lina.
We can also display confusion matrices for our 9 configurations:
outputs_individual$averaged_confusion %>% get_confusion_matrix_9_configurations()
The patterns of correct and incorrect classifications seem to be the same in all 9 configurations. There does not seem to be a differential pattern of improvement for individuals with more or fewer calls. To further investigate, we can draw side by side the two confusion matrices for the best (xgboost - MFCC) and worst (svm - DCT) configurations:
p_xgboost_MFCC <- outputs_individual$averaged_confusion %>% get_confusion_matrix("xgboost", "MFCC", display_raw_numbers = F)
p_svm_DCT <- outputs_individual$averaged_confusion %>% get_confusion_matrix("svm", "DCT", display_raw_numbers = F)
grid.arrange(p_xgboost_MFCC, p_svm_DCT, ncol = 2)
We can see that all the individuals are better predicted with the better configuration (whereas we could have observed small violations of this trend for specific individuals). The better configuration therefore seems to be uniformly better.
For the Bioacoustic and DCT feature sets, we can look at the importance of the various features for different classifying algorithms. We leave MFCC aside as there are too many features which are not really meaningful when considered in isolation.
We first look at the DCT set:
p1 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("svm", "DCT")
p2 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
p3 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("nn", "DCT")
grid.arrange(p1, p2, p3, ncol = 3)
The 3 algorithms all suggest that dct0 is the feature playing the most important role. dct1, dct2, dct4 and duration come next, quite close to one another, with rankings that depend on the classifying algorithm. vocalization.HNR and dct3 are a bit weaker in all three approaches.
We then look at the Bioacoustic set:
p1 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("svm", "Bioacoustic")
p2 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
p3 <- outputs_individual$averaged_fimp %>% get_fimp_barplot("nn", "Bioacoustic")
grid.arrange(p1, p2, p3, ncol = 3)
We observe that duration is consistently the most important feature for prediction of the individual. vocalization.HNR also plays an important role, and other parameters such as q1t also seem slightly more important than the rest, although the pattern is less clear-cut.
It is interesting to note that while duration is the most important feature for Bioacoustic, it is not for DCT.
p_a <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic", axis_font_size = 15, axis_title_font_size = 20, title_font_size = 24)
p_b <- outputs_individual$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT", axis_font_size = 15, axis_title_font_size = 20, title_font_size = 24)
grid.arrange(p_a, p_b, ncol = 2)
We can look at the performances of our different ensemble models, and compare them to those of our initial/primary classifiers. We first focus on log loss, since it is the metric used during the tuning of the hyper-parameters of the different classifiers:
outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("log loss")
Several points can be highlighted:
We can compute and display the increase in performance with respect to the worst configuration (svm and DCT):
outputs_individual$averaged_performances %>% filter(measure_name == "log loss") %>%
mutate(improvement_percentage = round(100 * (max(mean) - mean) / max(mean), 2)) %>%
ggplot(aes(x = reorder(configuration, improvement_percentage), y = improvement_percentage, fill = classifier_type)) +
geom_bar(stat = "identity", col = "black", position=position_dodge()) +
geom_text(aes(label = round(improvement_percentage, 1)), hjust = 1.2, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
labs(fill = "Classifier type") +
theme_minimal() + coord_flip() + ylab("log loss") + xlab("Feature set") + ggtitle("Comparison of the performances with the logloss metric")
The increase in performance between the worst and best configurations is very clear - 23.0%.
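As a rough check with the rounded means reported in the results table below (svm - DCT at 1.567, full ensemble at 1.210):
round(100 * (1.567 - 1.210) / 1.567, 1) # 22.8, close to the 23.0% obtained from the unrounded means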
We can further look at other metrics:
outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("AUC")
outputs_individual$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("balanced accuracy")
The AUC values do not reveal the improvement in performances as clearly as the other metrics, since the values are all very high - in other words, there is little ‘differentiation power’. However, no result contradicts what was observed with log loss. The same holds for balanced accuracy.
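The stacked models themselves were built beforehand; a minimal sketch of how such an ensemble can be defined with mlr (the learner names and the choice of glmnet as super learner are assumptions):
base_learners <- lapply(c("classif.svm", "classif.nnet", "classif.xgboost"), makeLearner, predict.type = "prob")
stacked_learner <- makeStackedLearner(base.learners = base_learners,
                                      super.learner = "classif.glmnet",
                                      predict.type = "prob",
                                      method = "stack.cv") # super learner fitted on cross-validated predictions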
We can inspect the confusion matrix of the best stacked model:
p_best <- outputs_individual$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets", font_size = 4.5, axis_font_size = 15, axis_title_font_size = 20, title_font_size = 16)
p_best
We can display the previous matrix side by side with two other confusion matrices shown previously to better understand how the performances are improved:
p_best_comp <- outputs_individual$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets")
grid.arrange(p_best_comp, p_xgboost_MFCC, p_svm_DCT, ncol = 2, nrow = 2)
We can see that the performances with the best stacked model are improved for all the individuals, without any obvious differential effect.
We first load the results of the 1,000 repetitions with nested shuffling / random permutation (inside types), and create different data tables…
outputs_individual_shuffled <- load_and_prepare_data("Result files/results - individual - shuffled.RData", decreasing_order_nb_calls_by_individual)
We draw the histogram of the values of a metric with the permutation approach, as well as a vertical line representing the mean value for the initial, non-permuted, approach.
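The permutation itself was performed upstream; a minimal sketch of what the nested shuffling could look like (an assumption about the exact procedure):
df_shuffled <- df %>%
  group_by(type) %>%
  mutate(individual = sample(individual)) %>% # permute the individual labels within each call type
  ungroup()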
For logloss:
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
filter(algo != "ensemble", measure_name == "log loss") %>%
select(algo, set, mean)
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "log loss")
outputs_individual_shuffled$performances %>%
filter(measure_name == "log loss") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("log loss") +
labs(title = "Histograms of the distribution of log loss values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
The histograms do not overlap at all and are very distant from each other (the red line representing the mean of the performances without shuffling lies clearly to the left of the ‘shuffled’ histogram). This means that no log loss value obtained with a random permutation is ever lower than the value obtained with our data. We can therefore reject the null hypothesis that our prediction is no different from chance prediction, with a p-value lower than 0.001.
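With 1,000 permutations and no permuted value beating the observed one, the standard empirical p-value (which counts the observed statistic among the permutations) is 1/1001 < 0.001; a minimal sketch:
empirical_p <- function(observed, permuted) {
  # One-sided p-value for a 'smaller is better' metric such as log loss
  (1 + sum(permuted <= observed)) / (1 + length(permuted))
}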
For AUC:
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
filter(algo != "ensemble", measure_name == "AUC") %>%
select(algo, set, mean)
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "AUC")
outputs_individual_shuffled$performances %>%
filter(measure_name == "AUC") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("AUC") +
labs(title = "Histograms of the distribution of AUC values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
The situation is symmetrical to the previous one, which makes sense since better performances mean lower log loss but higher AUC. No AUC value obtained with a random permutation is ever higher than the values obtained with our data. We can therefore reject the null hypothesis that our prediction is no different from chance prediction, with a p-value lower than 0.001.
We reach the same conclusions with the balanced accuracy metric:
perfs_mean_tmp <- outputs_individual$averaged_performances %>%
filter(algo != "ensemble", measure_name == "balanced accuracy") %>%
select(algo, set, mean)
perfs_tmp <- outputs_individual$performances %>% filter(algo != "ensemble", measure_name == "balanced accuracy")
outputs_individual_shuffled$performances %>%
filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("AUC") +
labs(title = "Histograms of the distribution of balanced accuracy values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
We can therefore very confidently conclude that our primary classifiers - svm, nn and xgboost - all predict well above chance.
What is the average balanced accuracy for the 1,000 repetitions with shuffling?
outputs_individual_shuffled$averaged_performances %>%
filter(measure_name == "balanced accuracy") %>%
select(-measure_name, -configuration, -classifier_type, -sd) %>%
kable(align = 'llr') %>% kable_classic() %>% kable_styling(full_width = T, bootstrap_options = c("striped"))
| algo | set | mean |
|---|---|---|
| nn | Bioacoustic | 0.1202695 |
| nn | DCT | 0.1202674 |
| nn | MFCC | 0.1153475 |
| svm | Bioacoustic | 0.1052773 |
| svm | DCT | 0.1064931 |
| svm | MFCC | 0.1021732 |
| xgboost | Bioacoustic | 0.1276787 |
| xgboost | DCT | 0.1268774 |
| xgboost | MFCC | 0.1213307 |
We find in all configurations an average balanced accuracy very close to 0.1, which might not be surprising given that we have 10 individuals (but see the results for call types).
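Indeed, under purely random predictions the expected recall of each of the k classes is 1/k, so the chance-level balanced accuracy is about 1/10 here; a quick simulated check:
set.seed(42)
truth <- factor(sample(1:10, 10000, replace = TRUE))
pred <- factor(sample(1:10, 10000, replace = TRUE), levels = levels(truth))
confusion <- table(pred, truth)
mean(diag(confusion) / colSums(confusion)) # average per-class recall, close to 0.1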
baseline_individual <- outputs_individual_shuffled$averaged_performances %>%
filter(measure_name == "balanced accuracy") %>%
pull(mean) %>%
mean()
Random baseline: 0.1161905
In order to prepare a table of results for the publication, we need to import earlier data obtained with the DFA…
algo_levels <- c("DFA", "svm", "nn", "xgboost", "ensemble")
set_levels <- c("Bioacoustic", "DCT", "MFCC",
"3 sets, svm", "3 sets, nn", "3 sets, xgboost",
"3 classifiers, Bioacoustic set", "3 classifiers, DCT set", "3 classifiers, MFCC set", "3 classifiers x 3 sets")
configs <- c(rep("", 3), paste0("(", 1:9, ")"), "(1+2+3)", "(4+5+6)", "(7+8+9)", "(1+4+7)", "(2+5+8)", "(3+6+9)", "(1+2+3+4+5+6+7+8+9)")
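# (1)-(9) number the nine single-classifier configurations in the order produced
# below by arrange(algo, set) - svm, nn, xgboost crossed with Bioacoustic, DCT,
# MFCC - so that the ensemble labels can reference them: e.g. "(1+2+3)" stacks
# the three svm configurations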
res_DFA <- readRDS("Result files/dfa-permuted-resultats01.rds")
DFA_Bio_logloss <- res_DFA$individual$Bioacoustic %>% pull(Log.Loss)
DFA_Bio_auc <- res_DFA$individual$Bioacoustic %>% pull(AUC)
DFA_Bio_bac <- res_DFA$individual$Bioacoustic %>% pull(Bal.Acc)
DFA_Bio_acc <- res_DFA$individual$Bioacoustic %>% pull(N.Correct.Test) / res_DFA$individual$Bioacoustic %>% pull(N.Test)
DFA_DCT_logloss <- res_DFA$individual$DCT %>% pull(Log.Loss)
DFA_DCT_auc <- res_DFA$individual$DCT %>% pull(AUC)
DFA_DCT_bac <- res_DFA$individual$DCT %>% pull(Bal.Acc)
DFA_DCT_acc <- res_DFA$individual$DCT %>% pull(N.Correct.Test) / res_DFA$individual$DCT %>% pull(N.Test)
DFA_MFCC_logloss <- res_DFA$individual$MFCC %>% pull(Log.Loss)
DFA_MFCC_auc <- res_DFA$individual$MFCC %>% pull(AUC)
DFA_MFCC_bac <- res_DFA$individual$MFCC %>% pull(Bal.Acc)
DFA_MFCC_acc <- res_DFA$individual$MFCC %>% pull(N.Correct.Test) / res_DFA$individual$MFCC %>% pull(N.Test)
We create and display a comprehensive table of results…
output_df <- outputs_individual$averaged_performances %>%
select(-sd, -classifier_type, -configuration) %>%
pivot_wider(names_from = measure_name, values_from = mean) %>%
select(-Kappa, -`mean misclassification error`) %>%
add_row(algo = "DFA", set = "Bioacoustic", accuracy = DFA_Bio_acc, AUC = DFA_Bio_auc, `balanced accuracy` = DFA_Bio_bac, `log loss` = DFA_Bio_logloss) %>%
add_row(algo = "DFA", set = "DCT", accuracy = DFA_DCT_acc, AUC = DFA_DCT_auc, `balanced accuracy` = DFA_DCT_bac, `log loss` = DFA_DCT_logloss) %>%
add_row(algo = "DFA", set = "MFCC", accuracy = DFA_MFCC_acc, AUC = DFA_MFCC_auc, `balanced accuracy` = DFA_MFCC_bac, `log loss` = DFA_MFCC_logloss) %>%
mutate(algo = factor(algo, algo_levels), set = factor(set, set_levels)) %>%
arrange(algo, set) %>%
mutate(Configuration = configs) %>%
select(algo, set, Configuration, `log loss`, AUC, accuracy, `balanced accuracy`) %>%
mutate_if(is.numeric, round, digits = 3) %>%
rename(Algorithm = "algo", `Feature set` = "set")
output_df %>% kable(align = 'llccrrrrr') %>% kable_classic() %>% kable_styling(full_width = T, bootstrap_options = c("striped"))
| Algorithm | Feature set | Configuration | log loss | AUC | accuracy | balanced accuracy |
|---|---|---|---|---|---|---|
| DFA | Bioacoustic |  | 1.745 | 0.731 | 0.410 | 0.236 |
| DFA | DCT |  | 1.698 | 0.731 | 0.429 | 0.235 |
| DFA | MFCC |  | 1.785 | 0.732 | 0.457 | 0.293 |
| svm | Bioacoustic | (1) | 1.490 | 0.840 | 0.516 | 0.392 |
| svm | DCT | (2) | 1.567 | 0.822 | 0.486 | 0.364 |
| svm | MFCC | (3) | 1.416 | 0.855 | 0.548 | 0.447 |
| nn | Bioacoustic | (4) | 1.487 | 0.839 | 0.508 | 0.381 |
| nn | DCT | (5) | 1.537 | 0.829 | 0.490 | 0.370 |
| nn | MFCC | (6) | 1.415 | 0.855 | 0.536 | 0.414 |
| xgboost | Bioacoustic | (7) | 1.415 | 0.850 | 0.544 | 0.447 |
| xgboost | DCT | (8) | 1.530 | 0.827 | 0.493 | 0.401 |
| xgboost | MFCC | (9) | 1.366 | 0.861 | 0.552 | 0.449 |
| ensemble | 3 sets, svm | (1+2+3) | 1.240 | 0.887 | 0.594 | 0.495 |
| ensemble | 3 sets, nn | (4+5+6) | 1.264 | 0.886 | 0.583 | 0.466 |
| ensemble | 3 sets, xgboost | (7+8+9) | 1.254 | 0.885 | 0.589 | 0.484 |
| ensemble | 3 classifiers, Bioacoustic set | (1+4+7) | 1.420 | 0.851 | 0.542 | 0.427 |
| ensemble | 3 classifiers, DCT set | (2+5+8) | 1.493 | 0.835 | 0.500 | 0.389 |
| ensemble | 3 classifiers, MFCC set | (3+6+9) | 1.311 | 0.873 | 0.571 | 0.465 |
| ensemble | 3 classifiers x 3 sets | (1+2+3+4+5+6+7+8+9) | 1.210 | 0.894 | 0.605 | 0.507 |
We further save the table of results as a text file…
write.table(output_df, file = "Result files/performances - individual.txt", sep = "\t", dec = ".", row.names = F, quote = F)We create a new barplot which contains additional bar for the DFA. The metric is balanced accuracy, and we add a vertical line for the average baseline performance…
res_DFA <- readRDS("Result files/dfa-permuted-resultats01.rds")
DFA_Bio_bac_m <- res_DFA$individual$Bioacoustic %>% pull(Bal.Acc)
DFA_Bio_bac_sd <- res_DFA$individual$Bioacoustic %>% pull(sd.Bal.Acc)
DFA_DCT_bac_m <- res_DFA$individual$DCT %>% pull(Bal.Acc)
DFA_DCT_bac_sd <- res_DFA$individual$DCT %>% pull(sd.Bal.Acc)
DFA_MFCC_bac_m <- res_DFA$individual$MFCC %>% pull(Bal.Acc)
DFA_MFCC_bac_sd <- res_DFA$individual$MFCC %>% pull(sd.Bal.Acc)
outputs_individual$averaged_performances %>%
add_row(algo = "DFA", set = "Bioacoustic", measure_name = "balanced accuracy", configuration = "DFA - Bioacoustic", classifier_type = "single classifier", mean = DFA_Bio_bac_m, sd = DFA_Bio_bac_sd) %>%
add_row(algo = "DFA", set = "DCT", measure_name = "balanced accuracy", configuration = "DFA - DCT", classifier_type = "single classifier", mean = DFA_DCT_bac_m, sd = DFA_DCT_bac_sd) %>%
add_row(algo = "DFA", set = "MFCC", measure_name = "balanced accuracy", configuration = "DFA - MFCC", classifier_type = "single classifier", mean = DFA_MFCC_bac_m, sd = DFA_MFCC_bac_sd) %>%
filter(algo != "ensemble" | configuration == "ensemble - 3 classifiers x 3 sets") %>%
get_barplots_comparison_all_configurations_by_set("balanced accuracy") +
geom_segment(x = -0.5, y = baseline_individual, xend = 14.0, yend = baseline_individual, colour = "red", lineend = "round", size = 0.5, linetype = "dashed")
We first load the results of the assessments of the different models, and create different data tables…
outputs_type <- load_and_prepare_data("Result files/results - type - default.RData", decreasing_order_nb_calls_by_type)
We have conducted 100 repetitions of the same sequence: i) randomly create a training-test partition, ii) train a model on the training set, iii) assess the model on the test set. We can look at the distribution of the values of the different metrics across the repetitions.
For logloss:
outputs_type$performances %>% get_histograms_9_configurations("log loss")
The distributions are similar in shape to those observed with individual. More repetitions would likely lead to smooth and symmetrical distributions, possibly Gaussian. Once again, the range of variation of the log loss values is quite significant in each configuration, which confirms that running many repetitions was worthwhile.
We reach the same conclusions with multi-class AUC, balanced accuracy and accuracy:
outputs_type$performances %>% get_histograms_9_configurations("AUC")outputs_type$performances %>% get_histograms_9_configurations("balanced accuracy")outputs_type$performances %>% get_histograms_9_configurations("accuracy")We first focus on logloss, which is the metric which was used for the tuning of the hyper-parameters of our different classifiers. We display the average performances for each classifier and each set of predictors:
outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("log loss")
We can make a number of observations which differ from those we could make with individual:
We can confirm these results with two other metrics: multi-class AUC and balanced accuracy…
p1 <- outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("AUC")
p2 <- outputs_type$averaged_performances %>% get_barplots_comparison_9_configurations("balanced accuracy")
grid.arrange(p1, p2, ncol = 2)
While AUC gives nearly the same ranking across the 9 configurations as log loss, there are more differences with balanced accuracy. We can, however, conclude that whatever the metric, xgboost combined with Bioacoustic offers the best performances. MFCC parameters, on the contrary, consistently lead to the worst performances.
We do not observe any specific pattern for the standard deviations.
We can display as a reminder the call types ranked by the number of calls for each of them:
df_type %>% mutate(n = text_spec(n, color = "white", align = "r", background = spec_color(n, option = "D", scale_from = c(0, 400)))) %>%
kbl(digits = 3, escape = F) %>%
kable_classic() %>%
kable_styling(full_width = F)
| Type | n |
|---|---|
| PY | 443 |
| SB | 382 |
| B | 274 |
| P | 255 |
| SCB | 206 |
We can display the confusion matrix for our best case scenario, when xgboost is considered with Bioacoustic:
outputs_type$averaged_confusion %>% get_confusion_matrix("xgboost", "Bioacoustic", display_raw_numbers = T)
The performances are clearly better than with individuals. P is classified extremely well (93% accuracy). We can observe some confusion between B, SB and SCB, and very little confusion between P and PY. There is no correlation between the number of calls per call type and the per-type classification accuracy.
We can also display confusion matrices for our 9 configurations:
outputs_type$averaged_confusion %>% get_confusion_matrix_9_configurations()
The patterns of correct and incorrect classifications seem to be the same in all 9 configurations. There does not seem to be a differential pattern of improvement for types with more or fewer calls. To further investigate, we can draw side by side the two confusion matrices for the best (xgboost - Bioacoustic) and worst (nn - MFCC) configurations:
p_xgboost_Bioacoustic <- outputs_type$averaged_confusion %>% get_confusion_matrix("xgboost", "Bioacoustic", display_raw_numbers = T)
p_nn_MFCC <- outputs_type$averaged_confusion %>% get_confusion_matrix("nn", "MFCC", display_raw_numbers = T)
grid.arrange(p_xgboost_Bioacoustic, p_nn_MFCC, ncol = 2)
We can see that P is especially impacted by choosing the weaker configuration - it is far more often confused with PY. The confusion between B, SB and SCB is also greater.
For each set of predictors, we can look at the importance of the various features for different classifying algorithms.
We first look at the DCT set:
p1 <- outputs_type$averaged_fimp %>% get_fimp_barplot("svm", "DCT")
p2 <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT")
p3 <- outputs_type$averaged_fimp %>% get_fimp_barplot("nn", "DCT")
grid.arrange(p1, p2, p3, ncol = 3)
The 3 algorithms strongly suggest that duration and dct2 play by far the most important roles. dct4 comes next, ahead of the other predictors.
We then look at the Bioacoustic set:
p1 <- outputs_type$averaged_fimp %>% get_fimp_barplot("svm", "Bioacoustic")
p2 <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic")
p3 <- outputs_type$averaged_fimp %>% get_fimp_barplot("nn", "Bioacoustic")
grid.arrange(p1, p2, p3, ncol = 3)
We observe that duration is consistently and by far the most important feature for prediction of the type. f0_onset, f0_offset, f0.slope.asc and f0.slope.desc come next.
p_a <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "Bioacoustic", axis_font_size = 15, axis_title_font_size = 20, title_font_size = 24)
p_b <- outputs_type$averaged_fimp %>% get_fimp_barplot("xgboost", "DCT", axis_font_size = 15, axis_title_font_size = 20, title_font_size = 24)
grid.arrange(p_a, p_b, ncol = 2)
We can look at the performances of our different ensemble models, and compare them to those of our initial/primary classifiers. We first focus on log loss, since it is the metric used during the tuning of the hyper-parameters of the different classifiers:
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("log loss")
Several points can be highlighted:
We can compute and display the increase in performance with respect to the worst configuration (nn and MFCC):
outputs_type$averaged_performances %>%
filter(measure_name == "log loss") %>%
mutate(improvement_percentage = round(100 * (max(mean) - mean) / max(mean), 2)) %>%
ggplot(aes(x = reorder(configuration, improvement_percentage), y = improvement_percentage, fill = classifier_type)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_text(aes(label = round(improvement_percentage, 1)), hjust = 1.2, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
labs(fill = "Classifying algorithm") +
theme_minimal() + coord_flip() + ylab("log loss") + xlab("Feature set") + ggtitle("Comparison of the performances with the log loss metric")
The best classifier is nearly 40% better than the worst one.
We can further look at other metrics:
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("AUC")
outputs_type$averaged_performances %>% get_barplots_comparison_all_configurations_by_classifier_type("balanced accuracy")
AUC is very consistent with log loss, and balanced accuracy is also largely consistent.
We can inspect the confusion matrix of the best stacked model:
p_best <- outputs_type$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets", font_size = 7, axis_font_size = 18, axis_title_font_size = 20, title_font_size = 18)
p_best
We can further display the previous matrix side by side with two other confusion matrices shown previously to better understand how the performances are improved:
p_best_comp <- outputs_type$averaged_confusion %>% get_confusion_matrix("ensemble", "3 classifiers x 3 sets", display_raw_numbers = F, title = "Confusion matrix - Ensemble 3 classifiers x 3 sets")
grid.arrange(p_best_comp, p_xgboost_Bioacoustic, p_nn_MFCC, ncol = 2, nrow = 2)
We can see that the performances with the best stacked model are improved for all the types but P, compared to the best non-ensemble learner.
We first load the results of the 1,000 repetitions with nested shuffling / random permutation (inside individuals), and create different data tables…
outputs_type_shuffled <- load_and_prepare_data("Result files/results - type - shuffled.RData", decreasing_order_nb_calls_by_type)
We draw the histogram of the values of a metric with the permutation approach, as well as a vertical line representing the mean value for the initial, non-permuted, approach.
For logloss:
perfs_mean_tmp <- outputs_type$averaged_performances %>%
filter(algo != "ensemble", measure_name == "log loss") %>%
select(algo, set, mean)
perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "log loss")
outputs_type_shuffled$performances %>%
filter(measure_name == "log loss") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("log loss") +
labs(title = "Histograms of the distribution of log loss values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
The histograms do not overlap at all and are very distant from each other (the red line representing the mean of the performances without shuffling lies clearly to the left of the ‘shuffled’ histogram). This means that no log loss value obtained with a random permutation is ever lower than the value obtained with our data. We can therefore reject the null hypothesis that our prediction is no different from chance prediction, with a p-value lower than 0.001.
For AUC:
perfs_mean_tmp <- outputs_type$averaged_performances %>%
filter(algo != "ensemble", measure_name == "AUC") %>%
select(algo, set, mean)
perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "AUC")
outputs_type_shuffled$performances %>%
filter(measure_name == "AUC") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("AUC") +
labs(title = "Histograms of the distribution of AUC values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
We reach the same conclusions with the balanced accuracy metric:
perfs_mean_tmp <- outputs_type$averaged_performances %>%
filter(algo != "ensemble", measure_name == "balanced accuracy") %>%
select(algo, set, mean)
perfs_tmp <- outputs_type$performances %>% filter(algo != "ensemble", measure_name == "balanced accuracy")
outputs_type_shuffled$performances %>%
filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = measure)) +
geom_histogram(bins = 80) +
facet_grid(algo ~ set, scale = "free") +
xlab("AUC") +
labs(title = "Histograms of the distribution of balanced accuracy values", subtitle = "Different classifiers and feature sets") +
geom_histogram(data = perfs_tmp, aes(x = measure), fill = "pink", bins = 40) +
geom_segment(data = perfs_mean_tmp, aes(x = mean, y = 0, xend = mean, yend = 100), colour = "red", size = 1)
The situation is symmetrical to the previous one, which makes sense since better performances mean lower log loss but higher AUC. No AUC value obtained with a random permutation is ever higher than the values obtained with our data. We can therefore reject the null hypothesis that our prediction is no different from chance prediction, with a p-value lower than 0.001. We reach the same conclusions with the balanced accuracy metric.
We can therefore very confidently conclude that our primary classifiers, svm, nn and xgboost all predict well above chance.
What is the average balanced accuracy for the 1,000 repetitions with shuffling?
outputs_type_shuffled$averaged_performances %>%
filter(measure_name == "balanced accuracy") %>%
select(-measure_name, -configuration, -classifier_type, -sd) %>%
kable(align = 'llr') %>% kable_classic() %>% kable_styling(full_width = T, bootstrap_options = c("striped"))
| algo | set | mean |
|---|---|---|
| nn | Bioacoustic | 0.2367736 |
| nn | DCT | 0.2365578 |
| nn | MFCC | 0.2360988 |
| svm | Bioacoustic | 0.2292523 |
| svm | DCT | 0.2314043 |
| svm | MFCC | 0.2231728 |
| xgboost | Bioacoustic | 0.2439316 |
| xgboost | DCT | 0.2387587 |
| xgboost | MFCC | 0.2373608 |
We find in all configurations an average balanced accuracy varying between 0.229 and 0.244, while we could have expected 0.20 given that we have 5 call types. This shows that adopting a nested random shuffling rather than a full random shuffling is a more rigorous and conservative approach: presumably, because the labels are only permuted within individuals, part of the association between individual identity and call-type usage survives the shuffling, leaving some residual predictability. For further investigations, we can consider as the random baseline the average value of the average balanced accuracies:
baseline_type <- outputs_type_shuffled$averaged_performances %>%
filter(measure_name == "balanced accuracy") %>%
pull(mean) %>%
mean()
Random baseline: 0.2348123
In order to prepare a table of results for the publication, we need to import earlier data obtained with the DFA…
algo_levels <- c("DFA", "svm", "nn", "xgboost", "ensemble")
set_levels <- c("Bioacoustic", "DCT", "MFCC",
"3 sets, svm", "3 sets, nn", "3 sets, xgboost",
"3 classifiers, Bioacoustic set", "3 classifiers, DCT set", "3 classifiers, MFCC set", "3 classifiers x 3 sets")
configs <- c(rep("", 3), paste0("(", 1:9, ")"), "(1+2+3)", "(4+5+6)", "(7+8+9)", "(1+4+7)", "(2+5+8)", "(3+6+9)", "(1+2+3+4+5+6+7+8+9)")
res_DFA <- readRDS("Result files/dfa-permuted-resultats01.rds")
DFA_Bio_logloss <- res_DFA$type$Bioacoustic %>% pull(Log.Loss)
DFA_Bio_auc <- res_DFA$type$Bioacoustic %>% pull(AUC)
DFA_Bio_bac <- res_DFA$type$Bioacoustic %>% pull(Bal.Acc)
DFA_Bio_acc <- res_DFA$type$Bioacoustic %>% pull(N.Correct.Test) / res_DFA$type$Bioacoustic %>% pull(N.Test)
DFA_DCT_logloss <- res_DFA$type$DCT %>% pull(Log.Loss)
DFA_DCT_auc <- res_DFA$type$DCT %>% pull(AUC)
DFA_DCT_bac <- res_DFA$type$DCT %>% pull(Bal.Acc)
DFA_DCT_acc <- res_DFA$type$DCT %>% pull(N.Correct.Test) / res_DFA$type$DCT %>% pull(N.Test)
DFA_MFCC_logloss <- res_DFA$type$MFCC %>% pull(Log.Loss)
DFA_MFCC_auc <- res_DFA$type$MFCC %>% pull(AUC)
DFA_MFCC_bac <- res_DFA$type$MFCC %>% pull(Bal.Acc)
DFA_MFCC_acc <- res_DFA$type$MFCC %>% pull(N.Correct.Test) / res_DFA$type$MFCC %>% pull(N.Test)
We create and display a comprehensive table of results…
output_df <- outputs_type$averaged_performances %>%
select(-sd, -classifier_type, -configuration) %>%
pivot_wider(names_from = measure_name, values_from = mean) %>%
select(-Kappa, -`mean misclassification error`) %>%
add_row(algo = "DFA", set = "Bioacoustic", accuracy = DFA_Bio_acc, AUC = DFA_Bio_auc, `balanced accuracy` = DFA_Bio_bac, `log loss` = DFA_Bio_logloss) %>%
add_row(algo = "DFA", set = "DCT", accuracy = DFA_DCT_acc, AUC = DFA_DCT_auc, `balanced accuracy` = DFA_DCT_bac, `log loss` = DFA_DCT_logloss) %>%
add_row(algo = "DFA", set = "MFCC", accuracy = DFA_MFCC_acc, AUC = DFA_MFCC_auc, `balanced accuracy` = DFA_MFCC_bac, `log loss` = DFA_MFCC_logloss) %>%
mutate(algo = factor(algo, algo_levels), set = factor(set, set_levels)) %>%
arrange(algo, set) %>%
mutate(Configuration = configs) %>%
select(algo, set, Configuration, `log loss`, AUC, accuracy, `balanced accuracy`) %>%
mutate_if(is.numeric, round, digits = 3) %>%
rename(Algorithm = "algo", `Feature set` = "set")
output_df %>% kable(align = 'llccrrrrr') %>% kable_classic() %>% kable_styling(full_width = T, bootstrap_options = c("striped"))
| Algorithm | Feature set | Configuration | log loss | AUC | accuracy | balanced accuracy |
|---|---|---|---|---|---|---|
| DFA | Bioacoustic |  | 0.819 | 0.894 | 0.651 | 0.588 |
| DFA | DCT |  | 0.813 | 0.901 | 0.674 | 0.596 |
| DFA | MFCC |  | 1.310 | 0.779 | 0.491 | 0.437 |
| svm | Bioacoustic | (1) | 0.671 | 0.931 | 0.736 | 0.718 |
| svm | DCT | (2) | 0.658 | 0.934 | 0.745 | 0.732 |
| svm | MFCC | (3) | 0.810 | 0.906 | 0.664 | 0.672 |
| nn | Bioacoustic | (4) | 0.674 | 0.933 | 0.728 | 0.702 |
| nn | DCT | (5) | 0.681 | 0.932 | 0.724 | 0.701 |
| nn | MFCC | (6) | 0.890 | 0.888 | 0.621 | 0.625 |
| xgboost | Bioacoustic | (7) | 0.627 | 0.940 | 0.755 | 0.747 |
| xgboost | DCT | (8) | 0.651 | 0.936 | 0.732 | 0.720 |
| xgboost | MFCC | (9) | 0.830 | 0.901 | 0.654 | 0.659 |
| ensemble | 3 sets, svm | (1+2+3) | 0.581 | 0.952 | 0.785 | 0.784 |
| ensemble | 3 sets, nn | (4+5+6) | 0.604 | 0.950 | 0.773 | 0.767 |
| ensemble | 3 sets, xgboost | (7+8+9) | 0.578 | 0.952 | 0.785 | 0.784 |
| ensemble | 3 classifiers, Bioacoustic set | (1+4+7) | 0.640 | 0.939 | 0.759 | 0.744 |
| ensemble | 3 classifiers, DCT set | (2+5+8) | 0.648 | 0.938 | 0.752 | 0.736 |
| ensemble | 3 classifiers, MFCC set | (3+6+9) | 0.792 | 0.912 | 0.675 | 0.683 |
| ensemble | 3 classifiers x 3 sets | (1+2+3+4+5+6+7+8+9) | 0.542 | 0.957 | 0.794 | 0.794 |
We further save the table of results as a text file…
write.table(output_df, file = "Result files/performances - type.txt", sep = "\t", dec = ".", row.names = F, quote = F)Another representation for balanced accuracy with added rows for the DFA:
res_DFA <- readRDS("Result files/dfa-permuted-resultats01.rds")
DFA_Bio_bac_m <- res_DFA$type$Bioacoustic %>% pull(Bal.Acc)
DFA_Bio_bac_sd <- res_DFA$type$Bioacoustic %>% pull(sd.Bal.Acc)
DFA_DCT_bac_m <- res_DFA$type$DCT %>% pull(Bal.Acc)
DFA_DCT_bac_sd <- res_DFA$type$DCT %>% pull(sd.Bal.Acc)
DFA_MFCC_bac_m <- res_DFA$type$MFCC %>% pull(Bal.Acc)
DFA_MFCC_bac_sd <- res_DFA$type$MFCC %>% pull(sd.Bal.Acc)
outputs_type$averaged_performances %>%
add_row(algo = "DFA", set = "Bioacoustic", measure_name = "balanced accuracy", configuration = "DFA - Bioacoustic", classifier_type = "single classifier", mean = DFA_Bio_bac_m, sd = DFA_Bio_bac_sd) %>%
add_row(algo = "DFA", set = "DCT", measure_name = "balanced accuracy", configuration = "DFA - DCT", classifier_type = "single classifier", mean = DFA_DCT_bac_m, sd = DFA_DCT_bac_sd) %>%
add_row(algo = "DFA", set = "MFCC", measure_name = "balanced accuracy", configuration = "DFA - MFCC", classifier_type = "single classifier", mean = DFA_MFCC_bac_m, sd = DFA_MFCC_bac_sd) %>%
filter(algo != "ensemble" | configuration == "ensemble - 3 classifiers x 3 sets") %>%
get_barplots_comparison_all_configurations_by_set("balanced accuracy") +
geom_segment(x = -0.5, y = baseline_type, xend = 14.0, yend = baseline_type, colour = "red", lineend = "round", size = 0.5, linetype = "dashed")
We focus here on a specific case of possible data leakage, which relates to the non-independence of calls due to their organization in sequences. We want to compare classification performances when i) we do nothing regarding how sequences overlap the training and test sets (default strategy), ii) we minimize how sequences overlap the training and test sets (fair strategy, i.e., the calls of a sequence tend to be either all in the training set or all in the test set), and iii) we maximize how sequences overlap the training and test sets (skewed strategy, i.e., the calls of a sequence are distributed as equally as possible between the training set and the test set).
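For reference, a strictly ‘fair’ split can be obtained by partitioning at the level of whole sequences; a minimal sketch using caret::groupKFold(), assuming a column sequence_id identifying the sequence each call belongs to (the actual strategies were implemented upstream, with a score quantifying the remaining overlap):
folds <- caret::groupKFold(df$sequence_id, k = 5) # every sequence falls entirely within one fold
training_ids <- folds[[1]] # row indices of the training set
testing_ids <- setdiff(seq_len(nrow(df)), training_ids)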
We first load the results of the assessments of the different configurations, and create different data tables…
outputs_individual_default <- load_and_prepare_data("Result files/results - individual - default.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_skewed <- load_and_prepare_data("Result files/results - individual - skewed.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_fair <- load_and_prepare_data("Result files/results - individual - fair.RData", decreasing_order_nb_calls_by_individual)
We define a function combine_tables(), which combines different tables of data corresponding to different strategies (default, fair, skewed) into a single table…
combine_tables <- function(dt_default, dt_skewed, dt_fair) {
dt_default <- dt_default %>% mutate(sequence_optim = "Default")
dt_skewed <- dt_skewed %>% mutate(sequence_optim = "Skewed")
dt_fair <- dt_fair %>% mutate(sequence_optim = "Fair")
dt_all <- dt_default %>%
rbind(dt_skewed) %>%
rbind(dt_fair)
return (dt_all)
}
We now call the previous function for the two tables corresponding to classification performances and feature importance…
performances_individual_all <-
combine_tables(outputs_individual_default$performances, outputs_individual_skewed$performances, outputs_individual_fair$performances) %>%
filter(algo != "ensemble")
averaged_fimp_individual_all <-
combine_tables(outputs_individual_default$averaged_fimp, outputs_individual_skewed$averaged_fimp, outputs_individual_fair$averaged_fimp) %>%
filter(algo != "ensemble")We finally need to combine the data regarding the quality/score of the three strategies - how much overlap there is…
sequence_scores_default <- tibble(sequence_score = outputs_individual_default$sequence_scores, sequence_optim = "Default")
sequence_scores_skewed <- tibble(sequence_score = outputs_individual_skewed$sequence_scores, sequence_optim = "Skewed")
sequence_scores_fair <- tibble(sequence_score = outputs_individual_fair$sequence_scores, sequence_optim = "Fair")
sequence_scores <- rbind(sequence_scores_default, sequence_scores_skewed, sequence_scores_fair)
We can display, for each strategy, how far the pairs of sets are from an optimum in terms of sequence distribution, i.e., a case where no sequence is distributed across the two sets:
hist_distance_to_optimum <- sequence_scores %>% ggplot(aes(x = sequence_score, fill = sequence_optim)) +
geom_histogram(bins = 220) +
scale_fill_manual(values=c("#f9cb9c", "#b6d7a8", "#980000")) +
xlab("Distance to the optimum") + labs(fill = "Strategy") +
xlim(0, 350) +
labs(title = "Distance to the optimum in terms of sequence overlapping", subtitle = "Histogram for 100 runs")
hist_distance_to_optimum
We can clearly see that the fair strategy yields the least sequence overlap and the skewed strategy the most, as expected. The default strategy is closer to the latter in terms of distance to the optimum, which suggests that doing nothing might not be such a good idea.
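One plausible definition of this distance (an assumption - the exact score was computed upstream) is the number of calls that would have to switch sets so that no sequence straddles the training and test sets; a minimal sketch:
sequence_score <- function(sequence_ids, in_training) {
  # For each sequence, count the calls sitting on the minority side of the split
  sum(tapply(in_training, sequence_ids, function(x) min(sum(x), sum(!x))))
}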
We wish to know whether sequences overlapping the training and test sets lead to higher classification performances because the classifiers take advantage of the resemblances between calls of the same sequence. Conversely, we wish to know whether restricting sequences to a single set - training or test - as much as possible leads to worse performances.
We display the distribution of performances as assessed with the log loss metric, highlighting the differences between the ‘fair’, ‘skewed’ and ‘default’ conditions:
performances_individual_all %>%
filter(measure_name == "log loss") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("log loss") +
labs(title = "Kernel density estimates - log loss values", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Strategy")
We observe that the performances for ‘default’ and ‘fair’ are quite close, although the distribution for ‘fair’ sits slightly more to the right, i.e. corresponds to higher values of log loss and therefore to worse performances. The distribution for ‘skewed’ clearly corresponds to lower values of log loss, i.e. to better performances. This supports our aforementioned hypothesis, although weakly.
We can repeat the same approach with balanced accuracy and AUC as the measures of performance:
p_perf_hist <- performances_individual_all %>%
filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("Balanced accuracy") +
labs(title = "Kernel density estimates - balanced accuracy", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Strategy")
p_perf_hist
performances_individual_all %>%
filter(measure_name == "AUC") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("AUC") +
labs(title = "Kernel density estimates - AUC values", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Strategy")What we observe leads to exactly the same conclusion as with log loss.
We compute the average performances for all the different configurations…
mean_performances_individual_all <- performances_individual_all %>%
group_by(sequence_optim, algo, set, measure_name) %>%
summarize(mean = mean(measure), sd = sd(measure), .groups = "drop") %>%
mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))
We display the average performances for the different sets of predictors and classifiers so that we can highlight the differences between our strategies for the sequences of calls:
mean_performances_individual_all %>% filter(measure_name == "log loss") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("log loss") + xlab("Strategies") +
labs(title = "Average performances - log loss", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifier") +
facet_grid(algo ~ set)
The performances clearly drop with the ‘fair’ strategy, although the drop is not very large; the difference between ‘default’ and ‘fair’ is smaller still.
We can confirm our observations with AUC and balanced accuracy:
mean_performances_individual_all %>% filter(measure_name == "AUC") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("AUC") + xlab("Strategies") +
labs(title = "Average performances - AUC", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifying algorithm") +
facet_grid(algo ~ set)p_perf_mean <- mean_performances_individual_all %>% filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("Balanced accuracy") + xlab("Strategies") +
labs(title = "Average performances - balanced accuracy", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifier") +
facet_grid(algo ~ set)
plot(p_perf_mean)
Both measures lead to exactly the same conclusion.
We can conclude that paying attention to the sequences does not substantially alter the performances we obtain. We may wonder, however, whether the importance of the different predictors is more strongly affected.
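The importance values analyzed below are those extracted from the fitted models during training. As a reminder of the general idea, here is a minimal sketch of one common approach, permutation importance, which may or may not match the measure used upstream; `predict_fun`, `data`, `truth`, `feature` and `metric` are hypothetical placeholders…
permutation_importance <- function(predict_fun, data, truth, feature, metric) {
  # Performance before perturbation
  baseline <- metric(predict_fun(data), truth)
  shuffled <- data
  shuffled[[feature]] <- sample(shuffled[[feature]]) # break the feature-target link
  # For a loss such as log loss, importance = increase in loss after shuffling
  metric(predict_fun(shuffled), truth) - baseline
}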
We create a figure for the publication…
p_both <- grid.arrange(hist_distance_to_optimum, p_perf_mean, ncol = 2)
We further compute the average importance of the predictors for all the different configurations…
averaged_fimp_individual_all <- averaged_fimp_individual_all %>% mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))
We display the average importance for the different classifiers and the different predictors of the DCT set, so that we can highlight the differences between our strategies for the sequences of calls:
averaged_fimp_individual_all %>%
filter(set == "DCT") %>% droplevels() %>%
ggplot(aes(x = sequence_optim, y = mean)) +
geom_bar(stat = "identity", col = "black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
xlab("Feature") + ylab("Importance") +
theme_minimal() + coord_flip() +
ggtitle("Importance of DCT coefficients") +
theme(plot.title = element_text(size = 10, face = "bold")) +
facet_grid(feature ~ algo)
We do not observe drastic changes between the three strategies.
We can check in a similar way the predictors of the Bioacoustic set:
averaged_fimp_individual_all %>%
filter(set == "Bioacoustic") %>% droplevels() %>%
ggplot(aes(x = sequence_optim, y = mean)) +
geom_bar(stat = "identity", col = "black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
xlab("Feature") + ylab("Importance") +
theme_minimal() + coord_flip() +
ggtitle("Importance of Bioacoustic coefficients") +
theme(plot.title = element_text(size = 10, face = "bold")) +
facet_grid(feature ~ algo)
Once again, the variation we observe between the three strategies is
quite limited.
One may wonder whether the previous results, which show a limited impact of the sequence management strategy, offer a biased perspective on the issue of data leakage, given the large number of sequences with only 1 or 2 calls in our dataset. During the curation phase, we therefore created a second dataset containing only calls from sequences of at least 3 calls, on which we can repeat the same procedure and thus obtain a complementary view of the question.
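As a sketch only, since the actual filtering was performed during the curation phase, such a dataset could be derived with dplyr, assuming the curated table carries a `sequence` identifier column…
df_long_sequences <- df %>%
  group_by(sequence) %>% # assumed sequence identifier column
  filter(n() >= 3) %>%   # keep only calls from sequences of at least 3 calls
  ungroup()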
We first load the results of the assessments of the different configurations, and create different data tables…
outputs_individual_default <- load_and_prepare_data("Result files/results - individual - long sequences - default.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_skewed <- load_and_prepare_data("Result files/results - individual - long sequences - skewed.RData", decreasing_order_nb_calls_by_individual)
outputs_individual_fair <- load_and_prepare_data("Result files/results - individual - long sequences - fair.RData", decreasing_order_nb_calls_by_individual)
We now call the function combine_tables() for the two tables corresponding to classification performances and feature importance…
performances_individual_all <-
combine_tables(outputs_individual_default$performances, outputs_individual_skewed$performances, outputs_individual_fair$performances) %>%
filter(algo != "ensemble")
averaged_fimp_individual_all <-
combine_tables(outputs_individual_default$averaged_fimp, outputs_individual_skewed$averaged_fimp, outputs_individual_fair$averaged_fimp) %>%
filter(algo != "ensemble")
We combine the data regarding the quality score of the three strategies…
sequence_scores_default <- tibble(sequence_score = outputs_individual_default$sequence_scores, sequence_optim = "Default")
sequence_scores_skewed <- tibble(sequence_score = outputs_individual_skewed$sequence_scores, sequence_optim = "Skewed")
sequence_scores_fair <- tibble(sequence_score = outputs_individual_fair$sequence_scores, sequence_optim = "Fair")
sequence_scores <- rbind(sequence_scores_default, sequence_scores_skewed, sequence_scores_fair)
We can display, for the three strategies, how far the pairs of sets are from the optimum in terms of sequence distribution:
hist_distance_to_optimum <- sequence_scores %>% ggplot(aes(x = sequence_score, fill = sequence_optim)) +
geom_histogram(bins = 220) +
scale_fill_manual(values=c("#f9cb9c", "#b6d7a8", "#980000")) +
xlab("Distance to the optimum") + labs(fill = "Strategy") +
xlim(0, 350) +
labs(title = "Distance to the optimum in terms of sequence overlapping (long sequences)", subtitle = "Histogram for 100 runs")
hist_distance_to_optimum
In comparison to the previous case, where even very short sequences were considered, we observe that the default strategy is much closer to the skewed strategy. Optimizing the distribution of sequences across the training and test sets leads to markedly different distributions of the calls in these sets.
We display the distribution of performances as assessed with the log loss metric, highlighting the differences between the three strategies:
performances_individual_all %>%
filter(measure_name == "log loss") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("log loss") +
labs(fill = "Strategy") +
labs(title = "Kernel density estimates - log loss (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies")
We observe a different pattern from the previous one: we see that the
default and skewed strategies are very close, while
the fair strategy leads to weaker performances.
We can repeat the same approach with balanced accuracy and AUC as the measures of performance:
p_perf_hist <- performances_individual_all %>%
filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("Balanced accuracy") +
labs(fill = "Strategy") +
labs(title = "Kernel density estimates - balanced accuracy (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies")
plot(p_perf_hist)
performances_individual_all %>%
filter(measure_name == "AUC") %>%
ggplot(aes(x = measure, fill = sequence_optim)) +
geom_density(alpha = 0.25) +
facet_grid(algo ~ set) +
xlab("AUC") +
labs(fill = "Strategy") +
labs(title = "Kernel density estimates - AUC (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies")
The patterns are similar to those we described with log
loss.
We compute the average performances for all the different configurations…
mean_performances_individual_all <- performances_individual_all %>%
group_by(sequence_optim, algo, set, measure_name) %>%
summarize(mean = mean(measure), sd = sd(measure), .groups = "drop") %>%
mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))
We display the average performances for the different sets of predictors and classifiers so that we can highlight the differences between our strategies for the sequences of calls:
mean_performances_individual_all %>% filter(measure_name == "log loss") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("log loss") + xlab("Strategies") +
labs(title = "Average performances - log loss (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifier") +
facet_grid(algo ~ set)
We can see clearly that the performances drop with the ‘fair’ strategy,
with a greater drop than when all calls were considered.
We can confirm our observations with AUC and balanced accuracy:
mean_performances_individual_all %>% filter(measure_name == "AUC") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 2.0, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("AUC") + xlab("Strategies") +
labs(title = "Average performances - AUC (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifying algorithm") +
facet_grid(algo ~ set)p_perf_mean <- mean_performances_individual_all %>% filter(measure_name == "balanced accuracy") %>%
ggplot(aes(x = sequence_optim, y = mean, fill = algo)) +
geom_bar(stat = "identity", col = "black", position = position_dodge()) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2, position = position_dodge(0.9)) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3.5, position = position_dodge(width = 0.9)) +
theme_minimal() + coord_flip() + ylab("Balanced accuracy") + xlab("Strategies") +
labs(title = "Average performances - balanced accuracy (long sequences)", subtitle = "Different classifiers, feature sets and overlapping stategies", fill = "Classifier") +
facet_grid(algo ~ set)
plot(p_perf_mean)
In conclusion, when we focus on longer sequences, failing to account for the data leakage caused by sequences of calls appearing in both the training and test sets has a greater impact. This should serve as a word of caution when dealing with other patterns of non-independence among observations.
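One standard remedy is group-aware resampling, which by construction never splits a group across the training and test sets. A minimal sketch with caret::groupKFold(), again assuming a `sequence` identifier column in the data table…
# Build folds in which all calls of a given sequence stay together
folds <- caret::groupKFold(df$sequence, k = 5)
# `folds` is a list of training-row indices, usable with caret::trainControl(index = folds)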
We create a figure for the publication…
p_both <- grid.arrange(hist_distance_to_optimum, p_perf_mean, ncol = 2)
We further compute the average importance of the predictors for all the different configurations…
averaged_fimp_individual_all <- averaged_fimp_individual_all %>% mutate(sequence_optim = fct_relevel(sequence_optim, c("Fair", "Default", "Skewed")))
We display the average importance for the different classifiers and the different predictors of the DCT set, so that we can highlight the differences between our strategies for the sequences of calls:
averaged_fimp_individual_all %>%
filter(set == "DCT") %>% droplevels() %>%
ggplot(aes(x = sequence_optim, y = mean)) +
geom_bar(stat = "identity", col = "black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
xlab("Feature") + ylab("Importance") +
theme_minimal() + coord_flip() +
ggtitle("Importance of DCT coefficients (long sequences only)") +
theme(plot.title = element_text(size = 10, face = "bold")) +
facet_grid(feature ~ algo)
We do not observe drastic changes between the three strategies.
We can check in a similar way the predictors of the Bioacoustic set:
averaged_fimp_individual_all %>%
filter(set == "Bioacoustic") %>% droplevels() %>%
ggplot(aes(x = sequence_optim, y = mean)) +
geom_bar(stat = "identity", col = "black") +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .2) +
geom_text(aes(label = round(mean, 3)), hjust = 1.5, color = "white", size = 3) +
xlab("Feature") + ylab("Importance") +
theme_minimal() + coord_flip() +
ggtitle("Importance of Bioacoustic coefficients (long sequences only)") +
theme(plot.title = element_text(size = 10, face = "bold")) +
facet_grid(feature ~ algo)
Once again, the variation we observe between the three strategies is
quite limited.
cat(utils::sessionInfo()$R.version$version.string, "<br />", sep = "")
R version 4.1.3 (2022-03-10)
cat("Platform: ", utils::sessionInfo()$platform, "<br />", sep = "")
Platform: x86_64-w64-mingw32/x64 (64-bit)
cat("OS version: ", utils::sessionInfo()$running, "<br />", sep = "")
OS version: Windows 10 x64 (build 22000)
packages.base <- utils::sessionInfo()$basePkgs
packages.others <- names(utils::sessionInfo()$otherPkgs)
packages <- c(packages.base, packages.others)
for (pkg in packages) {
  cat("`", pkg, "`: *",
      packageDescription(pkg)$Title,
      "* version ",
      packageDescription(pkg)$Version,
      "<br />",
      sep = "")
}
stats: The R Stats Package version 4.1.3
graphics: The R Graphics Package version 4.1.3
grDevices: The R Graphics Devices and Support for Colours and Fonts version 4.1.3
utils: The R Utils Package version 4.1.3
datasets: The R Datasets Package version 4.1.3
methods: Formal Methods and Classes version 4.1.3
base: The R Base Package version 4.1.3
glmnet: Lasso and Elastic-Net Regularized Generalized Linear Models version 4.1-6
Matrix: Sparse and Dense Matrix Classes and Methods version 1.4-1
summarytools: Tools to Quickly and Neatly Summarize Data version 1.0.1
kableExtra: Construct Complex Table with ‘kable’ and Pipe Syntax version 1.3.4
rlist: A Toolbox for Non-Tabular Data Manipulation version 0.4.6.2
mlr: Machine Learning in R version 2.19.1
ParamHelpers: Helpers for Parameters in Black-Box Optimization, Tuning and Machine Learning version 1.14
gridExtra: Miscellaneous Functions for “Grid” Graphics version 2.3
caret: Classification and Regression Training version 6.0-93
lattice: Trellis Graphics for R version 0.20-45
reshape2: Flexibly Reshape Data: A Reboot of the Reshape Package version 1.4.4
forcats: Tools for Working with Categorical Variables (Factors) version 0.5.2
stringr: Simple, Consistent Wrappers for Common String Operations version 1.4.1
dplyr: A Grammar of Data Manipulation version 1.0.10
purrr: Functional Programming Tools version 0.3.4
readr: Read Rectangular Text Data version 2.1.3
tidyr: Tidy Messy Data version 1.2.1
tibble: Simple Data Frames version 3.1.8
ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics version 3.4.0
tidyverse: Easily Install and Load the ‘Tidyverse’ version 1.3.2
knitr: A General-Purpose Package for Dynamic Report Generation in R version 1.41
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada
Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France
Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author